In [1]:
from sklearn import svm
from sklearn import metrics
from sklearn.linear_model import LogisticRegression

import scipy.io as sio

I think the hard part is how to vectorize the emails.
Using this preprocessed data set skips that step, so it is kind of cheating XD


In [2]:
# Read the training .mat file into a dict, then list its keys
# to see which variables it contains.
mat_tr = sio.loadmat(file_name='data/spamTrain.mat')
mat_tr.keys()


Out[2]:
dict_keys(['__header__', '__globals__', 'y', '__version__', 'X'])

Be careful with the column vector: shape (4000, 1) is not the same as (4000,), so `y` is flattened with `.ravel()` below.


In [3]:
# Feature matrix, taken as stored in the .mat dictionary.
X = mat_tr.get('X')
# Labels, flattened to a 1-D array with ravel().
y = mat_tr.get('y').ravel()
X.shape, y.shape


Out[3]:
((4000, 1899), (4000,))

In [4]:
# Read the test .mat file into a dict, then list its keys
# to see which variables it contains.
mat_test = sio.loadmat(file_name='data/spamTest.mat')
mat_test.keys()


Out[4]:
dict_keys(['__header__', '__globals__', 'ytest', '__version__', 'Xtest'])

In [5]:
# Test feature matrix, taken as stored in the .mat dictionary.
test_X = mat_test.get('Xtest')
# Test labels, flattened to a 1-D array with ravel().
test_y = mat_test.get('ytest').ravel()
test_X.shape, test_y.shape


Out[5]:
((1000, 1899), (1000,))

Fit an SVM model


In [6]:
# Support vector classifier with otherwise-default settings.
# gamma is pinned to 'auto' (the value shown in the recorded repr of the
# fitted model): scikit-learn 0.22 changed the default to 'scale', so relying
# on the default would train a different model on newer installs.
svc = svm.SVC(gamma='auto')

In [7]:
# Train the classifier on the training set; fit() returns the estimator,
# so its repr is shown as the cell output.
svc.fit(X=X, y=y)


Out[7]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [8]:
# Predict labels for the held-out test set with the fitted SVM,
# then print per-class precision / recall / F1.
pred = svc.predict(X=test_X)
print(metrics.classification_report(y_true=test_y, y_pred=pred))


             precision    recall  f1-score   support

          0       0.94      0.99      0.97       692
          1       0.98      0.87      0.92       308

avg / total       0.95      0.95      0.95      1000

What about linear logistic regression?


In [9]:
# Logistic regression with otherwise-default settings.
# The solver is pinned to 'liblinear' (the value shown in the recorded repr):
# scikit-learn 0.22 changed the default to 'lbfgs', so relying on the default
# would fit with a different optimizer on newer installs.
logit = LogisticRegression(solver='liblinear')
# fit() returns the estimator, so its repr is shown as the cell output.
logit.fit(X, y)


Out[9]:
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [10]:
# Predict labels for the held-out test set with the fitted logistic model,
# then print per-class precision / recall / F1.
pred = logit.predict(X=test_X)
print(metrics.classification_report(y_true=test_y, y_pred=pred))


             precision    recall  f1-score   support

          0       1.00      0.99      1.00       692
          1       0.99      0.99      0.99       308

avg / total       0.99      0.99      0.99      1000

Logistic regression scores higher here (0.99 vs. 0.95 average F1)... so then what is the SVM for?